##############################
# Media Measurement Matters  #
# Replication Code           #
# Survey Data Preparation    #
##############################

# The following file contains code to clean the survey data, calculate our
# two main summary measures (the attitudinal and behavioral indices), code
# pre-treatment demographics, and append revealed preference scores. In addition,
# it generates a table of sample demographics, along with balance tables for the
# two aspects of our randomization procedure (randomization to the free- or forced-
# choice group and randomization to a given media in the forced-choice group).

# Set-up ----

# Clear out environment
rm(list=ls())

# If desired, set up path into which to save plots and tables
plot_path <- NULL
table_path <- NULL

# Load packages
library(tidyverse)
library(psych)
library(MASS, exclude = "select")
library(ltm)
library(GPArotation)
library(xtable)
library(Matching)
library(ebal)
library(datasets)

# Set up helper operations
`%notin%` <- Negate(`%in%`)

# Read in raw survey data 
df <- read_csv("data/survey_data.csv")
dim(df)

# Set seed 
set.seed(531)

# Attitudinal index ----

# Function to reverse-code and normalize variables
  # variable = variable name (e.g., educ_costmore)
  # rev      = whether to reverse-code item (default = reverse-code)
  # action   = whether variable is associated with action index
rev_norm <- function(variable, rev = T, action = F){
  # For attitudinal index, if reverse-coded item, subtract variable scores from 7
  # and divide by 6 to get a normalized score from 0-1 where higher ratings = 
  # more conservative responses
  
  # If the item is not reverse-coded, subtract 1 and divide by 6 to normalize
  # the value from 0-1 (where higher scores again correspond to more conservative responses)
  
  # If the item is part of the action index, set "not sure" responses to NA, recode
  # values to range from 0-3, and divide by 3 to normalize from 0-1 (where higher ratings
  # = more willingness to engage)
  if(rev == T & action == F){
    return((7 - variable)/6)
  }else if(action == F){
    return((variable - 1)/6)
  }else{
    return(case_when(variable == 7 ~ NA_real_,
              variable == 1 ~ 3, variable == 2 ~ 2,
              variable == 3 ~ 1, variable == 4 ~ 0)/3)
  }
}

# Reverse-code relevant items and normalize all to range from 0 to 1
df <- df %>% 
  mutate(educ_costmore = rev_norm(educ_costmore, rev = T),
         vouchers_succ = rev_norm(vouchers_succ, rev = T),
         vouchers_wrong = rev_norm(vouchers_wrong, rev = F),
         charter_takemoney = rev_norm(charter_takemoney, rev = F),
         charter_parents = rev_norm(charter_parents, rev = T),
         charter_tradeoff = rev_norm(charter_tradeoff, rev = F),
         charter_system = rev_norm(charter_system, rev = T),
         charter_expensive = rev_norm(charter_expensive, rev = T),
         public_funding = rev_norm(public_funding, rev = F),
         charter_support = rev_norm(charter_support, rev = T),
         private_choice = rev_norm(private_choice, rev = T),
         teacher_salary = rev_norm(teacher_salary, rev = T))

dvs <- c("educ_costmore", "vouchers_succ", "vouchers_wrong", "charter_takemoney",
         "charter_parents", "charter_tradeoff", "charter_system", "charter_expensive",
         "public_funding", "charter_support", "private_choice", "teacher_salary")

# Code mean index
df <- df %>% 
  mutate(charter_index = rowMeans(df %>% select(all_of(dvs)), na.rm = T))

# Calculate Cronbach's alpha (0.91)
summary(alpha(df %>% select(all_of(dvs))))

# Behavioral index ----

# Reverse code the behavioral intentions items, remove "Not Sure", and normalize
df <- df %>% 
  mutate(actions_discuss = rev_norm(actions_discuss, action = T),
         actions_forward = rev_norm(actions_forward, action = T),
         actions_post = rev_norm(actions_post, action = T),
         actions_seek = rev_norm(actions_seek, action = T))

action_dvs <- paste0("actions_", c("discuss", "forward", "post", "seek"))

# Code mean index
df <- df %>% 
  mutate(actions_index = rowMeans(df %>% select(all_of(action_dvs)), na.rm = T))

# Calculate Cronbach's alpha (0.84)
summary(alpha(df %>% select(all_of(action_dvs))))

# Treatment indicators ----

# Create indicators of treatment status
df <- df %>% 
  mutate(med_pref = case_when(med_pref == 1 ~ "MSNBC", med_pref == 2 ~ "Fox",
                              med_pref == 3 ~ "Entertainment"),
         med_choice = case_when(med_choice == 1 ~ "MSNBC", med_choice == 2 ~ "Fox",
                                med_choice == 3 ~ "Entertainment"),
         article_read = case_when(forcedmedia == 1 ~ "Fox", forcedmedia == 2 ~ "MSNBC",
                                 forcedmedia == 3 ~ "Entertainment"),
         article_forced = case_when(forcedchoice == 1 ~ article_read,
                                    forcedchoice == 0 ~ NA_character_),
         article_read = factor(article_read, levels = c("MSNBC", "Fox", "Entertainment")),
         article_forced = factor(article_forced, levels = c("MSNBC", "Fox", "Entertainment")))

# Assign numeric values to each stated preference group
df <- df %>% 
  mutate(med_pref_num = factor(med_pref, levels = c("MSNBC", "Entertainment", "Fox"),
                           labels = 1:3))

# Manipulation checks ----

# Reverse-code manipulation checks
  # mc_charter: whether articles support/oppose charter schools/vouchers (higher ratings = more support)
  # mc_ideo: whether articles perceived as liberal/conservative (higher ratings = more conservative)
df <- df %>% 
  mutate(mc_charter = (mc_charter - 1)/4,
         mc_ideo = (mc_ideo - 1)/6)

df %>% 
  filter(!is.na(article_read) & article_read != "Entertainment") %>% 
  group_by(article_read) %>% 
  summarise(mean_support = mean(mc_charter, na.rm = T),
            mean_ideo = mean(mc_ideo, na.rm = T))

  # As expected, respondents perceive the MSNBC articles as much less supportive of 
  # charter schools, much more liberal, relative to the Fox News articles.

# Demographics ----

# Code political knowledge
df <- df %>% 
  mutate(polinfo1_cor = as.numeric(polinfo1 == 3),
         polinfo2_cor = as.numeric(polinfo2 == 1),
         polinfo3_cor = as.numeric(polinfo3 == 1),
         polinfo4_cor = as.numeric(polinfo4 == 1),
         polinfo5_cor = as.numeric(polinfo5 == 3))

df <- df %>% 
  mutate(polinfo = rowMeans(df %>% select(all_of(paste0("polinfo", 1:5, "_cor")))))

# Function to normalize variables to range from 0 to 1
normalized <- function(x, ...) {(x - min(x, ...)) / (max(x, ...) - min(x, ...))}

# Code demographic variables
df <- df %>% 
  mutate(# Personal demographics
         age = 2017 - yob, 
         age_cat = factor(age_cat, levels = 3:8, 
                          labels = c("18-24", "25-34", "35-44", "45-54", "55-64", "65+")),
         educ_bin = case_when(educ >= 4 ~ 1, TRUE ~ 0),
         educ_cat = case_when(educ <= 2 ~ "High school or less",
                              educ == 3 ~ "Some college", 
                              educ <= 5 ~ "College degree", 
                              TRUE ~ "Post-graduate degree"),
         educ_cat = factor(educ_cat, levels = c("High school or less", "Some college",
                                                "College degree", "Post-graduate degree")),
         race_sum = rowSums(df %>% select(paste0("race_", 1:7))),
         race_cat = case_when(race_sum > 1 ~ "Other", race_3 == 1 ~ "White",
                              race_1 == 1 ~ "Black", race_4 == 1 ~ "Hispanic/Latino",
                              race_1 != 1 ~ "Other"),
         race_cat = factor(race_cat, levels = c("White", "Black", "Hispanic/Latino", "Other")),
         race_black = ifelse(race_cat == "Black", 1, 0),
         race_white = ifelse(race_cat == "White", 1, 0),
         inc_bin = case_when(income == 15 ~ NA_real_, income >= 8 ~ 1, TRUE ~ 0),
         inc_cat = case_when(income <= 7 ~ "<$50K", income <= 11 ~ "$50-100K",
                             income <= 14 ~ "$100K+"),
         inc_cat = factor(inc_cat, levels = c("<$50K", "$50-100K", "$100K+")),
         # Stated news consumption
         days_bin = case_when(days == 8 ~ 1, days < 8 ~ 0),
         days_norm = normalized(days, na.rm = T),
         # Political interest
         int = 6 - int, # Reverse-code so that higher ratings = more interested 
         int_norm = normalized(int, na.rm = T),
         # Partisanship and ideology
         pid7 = case_when(party2 == 1 ~ -3, party2 == 2 ~ -2,
                          party4 == 2 ~ -1, party4 == 3 ~ 0, party4 == 1 ~ 1,
                          party3 == 2 ~ 2, party3 == 1 ~ 3),
         pid = case_when((party1 == 1 | party4 == 2) ~ -1,
                         party4 == 3 ~ 0,
                         (party1 == 2 | party4 == 1) ~ 1),
         ideo7 = case_when(ideo2 == 1 ~ -3, ideo2 == 2 ~ -2,
                           ideo4 == 1 ~ -1, ideo4 == 3 ~ 0, ideo4 == 2 ~ 1,
                           ideo3 == 2 ~ 2, ideo3 == 1 ~ 3),
         ideo = case_when((ideo1 == 1 | ideo4 == 1) ~ -1,
                          ideo4 == 3 ~ 0,
                          (ideo1 == 2 | ideo4 == 2) ~ 1))

# Append information about geographic region

# Load in dataset with information about geographic regions
states <- cbind.data.frame(state.name, state.region) %>% 
  mutate(state.region = factor(state.region, levels = c("Northeast",
                                                        "North Central",
                                                        "South", 
                                                        "West")),
         region_num = as.numeric(state.region)) %>% 
  bind_rows(data.frame(state.name = "District of Columbia", 
                       state.region = "South", region_num = 3)) %>% 
  mutate(qstate = row_number())

# Merge into dataset
df <- df %>% 
  left_join(states, by = "qstate")

# Calculated revealed pref. scores ----

# > Revealed slant - BMA ----

# Read in web data
web <- read_rds("data/web_data.rds")

# Calculate alignment scores for each respondent
score_calc <- function(df = web, score = "bma", score_name = "score",
                       score_var_name = "score_var"){
  if(score == "bma"){
    score_use <- "avg_align"
  }else{
    score_use <- "zeta"
  }
  
  scores <- df %>% 
    group_by(resp_id) %>% 
    mutate(score_use = get(score_use)) %>% 
    summarise(score = mean(score_use, na.rm = T),
              score_var = var(score_use, na.rm = T))
  
  colnames(scores) <- c("resp_id", score_name, score_var_name)
  
  return(scores)
}

# Scores including portals
scores <- score_calc()

# Scores without portals
scores_noportals <- score_calc(df = web %>% filter(domain_recode %notin% c("aol.com", "msn.com", "google.com")),
                               score_name = "score_noportals", score_var_name = "score_var_noportals")

# Scores without portals and Yahoo! News
scores_yahoo <- score_calc(df = web %>% filter(domain_recode %notin% c("aol.com", "msn.com", "google.com",
                                                                       "news.yahoo.com", "gma.yahoo.com")),
                               score_name = "score_yahoo", score_var_name = "score_var_yahoo")

scores <- scores %>% 
  left_join(scores_noportals, by = "resp_id") %>% 
  left_join(scores_yahoo, by = "resp_id") %>% 
  drop_na(score)

# Append number of news domain visits (even if not matched to BMA scores)
dom_visits <- web %>% 
  group_by(resp_id) %>% 
  summarise(dom_visits = n())

scores <- scores %>% 
  left_join(dom_visits, by = "resp_id")

# Append number of visits matched to BMA scores
match_visits <- web %>% 
  drop_na(avg_align) %>% 
  group_by(resp_id) %>% 
  summarise(matched_visits = n())

scores <- scores %>% 
  left_join(match_visits, by = "resp_id")

# Merge scores into the survey dataset
df <- df %>% 
  left_join(scores, by = "resp_id")

# > Revealed slant - Eady ----

# Scores for Eady et al. data
scores_eady <- score_calc(web %>% filter(domain_recode %notin% 
                                           c("aol.com", "msn.com", "google.com")),
                          score = "eady", score_name = "score_eady", 
                          score_var_name = "score_var_eady")

df <- df %>% 
  left_join(scores_eady, by = "resp_id")

# > Revealed volume - BMA ----

# Read in news consumption data
consump <- read_csv("data/news_consump.csv", na = c("#N/A", "", NA))

# Code respondents has having 0% news consumption if they had no recorded news events
consump[is.na(consump$`News Events`), "News Events"] <- 0
consump[is.na(consump$`News Events`), "X..News.Consumption"] <- "0%"
 
# Calculate a new value for news consumption
consump <- consump %>% 
  mutate(news_events = as.numeric(`News Events`),
         total_events = as.numeric(`Total Events`),
         news_consump = `News Events`/`Total Events`) 

# Merge in consumption scores
df <- df %>% 
  left_join(consump %>% select(resp_id = Resp_ID, news_consump, news_events, 
                               total_events), by = "resp_id")

# Sample size
(n_overall <- nrow(df)) # Number of respondents who began survey
(n_consump <- nrow(consump)) # Number of respondents who opted into web-tracking
(n_srvy <- sum(!is.na(df$score))) # Number of respondents who had at least one visit match to BMA

# Calculate revealed pref. scores (hard news only)----

# > Revealed slant ----

# Create filtered version of dataset with just the hard news URLs
web_hard <- web %>% 
  filter(hard_news == 1)

# Remove portal sites 
web_noport_hard <- web_hard %>% 
  filter(domain_recode %notin% c("aol.com", "msn.com", "google.com") &
           !is.na(avg_align))

# Calculate hard news scores
scores_hard <- web_noport_hard %>% 
  group_by(resp_id) %>% 
  summarise(score_hard = mean(avg_align, na.rm = T),
            score_var_hard = var(avg_align, na.rm = T)) %>% 
  as.data.frame()

df <- df %>% 
  left_join(scores_hard, by = "resp_id")

# > Revealed volume ----

# Load in unprocessed web data (with sequential duplicates)
web_full <- read_rds("data/web_raw.rds")

mean_hard_consump <- mean(web_full$hard_news)
sum_hard_consump <- sum(web_full$hard_news)

# Hard news visits
(mean_hard_news_consump <- mean_hard_consump) # % of hard news visits in full sample
(n_hard_news_consump <- sum_hard_consump) # Number of hard news visits in full sample
(n_full <- nrow(web_full)) # Number of total news visits, before removing sequential dupes

# Merge in the total number of hard news events for each respondent into the
# consumption data
news_events <- web_full %>% 
  filter(hard_news == 1) %>% 
  group_by(Resp_ID = resp_id) %>% 
  summarise(hard_events = n()) 

consump <- consump %>% 
  left_join(news_events, by = "Resp_ID") %>% 
  mutate(hard_perc = hard_events/total_events)

# Merge results with the survey data
consump_hard <- consump %>% 
  select(resp_id = Resp_ID,
         hard_events, hard_perc)

df <- df %>% 
  left_join(consump_hard, by = "resp_id")

# Look at distribution of news consumption for hard news URLs
(mean_perc_hard <- mean(df$hard_perc, na.rm = T)) # Average % of web visits to hard news
(med_perc_hard <- median(df$hard_perc, na.rm = T)) # Median % of web visits to hard news

# Generate binned scores ----

# > Revealed volume ----

# Generate terciles of news consumption
df <- df %>% 
  mutate(news_consump_bin3 = ntile(news_consump, 3))

# > Revealed slant ----

# Identify alignment scores for two exemplar sites: cnn.com, yahoo.com/news
align_scores <- web %>% 
  group_by(domain_recode) %>% 
  summarise(avg_align = mean(avg_align)) 

cnn <- align_scores %>% filter(domain_recode == "cnn.com") %>% pull(avg_align)
yahoo <- align_scores %>% filter(domain_recode == "news.yahoo.com") %>% pull(avg_align)

# Split respondents into three groups, based on how their revealed preference group
# compares to these exemplar sites
  # 1 = most liberal media diets
  # 2 = moderate/heterogeneous media diets
  # 3 = most conservative media diets
df <- df %>% 
  mutate(score_code = case_when(is.na(score_noportals) ~ NA_real_,
                                score_noportals < cnn ~ 1,
                                score_noportals < yahoo ~ 2, TRUE ~ 3), # All URLs
         score_code_hard = case_when(is.na(score_hard) ~ NA_real_,
                                     score_hard < cnn ~ 1,
                                     score_hard < yahoo ~ 2, TRUE ~ 3)) # Hard news only

# Save cleaned dataset ----

df_save <- df %>% 
  select(resp_id, article_read, article_forced, forcedchoice, med_pref, med_pref_num,
         med_choice, charter_index, actions_index, all_of(dvs), all_of(action_dvs),
         mc_charter, mc_ideo, gender, age, age_cat, educ, educ_bin, educ_cat, 
         race_cat, race_white, income, inc_bin, inc_cat, days, days_bin, 
         days_norm, int, int_norm, polinfo, pid, pid7, ideo, ideo7, qstate, region_num,
         score, score_var, score_noportals, score_var_noportals, score_yahoo, score_var_yahoo,
         dom_visits, matched_visits, score_eady, score_var_eady, news_consump,
         news_events, total_events, score_hard, score_var_hard, hard_events, hard_perc,
         news_consump_bin3, score_code, score_code_hard)

write_csv(df_save, "data/survey_data_cleaned.csv")
write_rds(df_save, "data/survey_data_cleaned.rds")

# > Append demographic data to web visits ----

# Merge in partisanship, ideology, and stated media preferences to the web-browsing data
web <- web %>% 
  left_join(df %>% select(resp_id, pid, ideo, med_pref), by = "resp_id")

# Save web data with demographics included
write_rds(web, "data/web_use.rds")

# Sample demographics ----

# Function to calculate sample proportions
  # variable = demographic variable for which to calculate marginal distribution
  # position = if summing over multiple categories
  # df       = dataframe to use for calculating sample demographics
prop_func <- function(variable, position, df = df){
  if (length(position) == 1){
    out <- as.numeric(prop.table(table(df[, variable]))[position])
  }
  else {
    out <- sum(as.numeric(prop.table(table(df[, variable]))[position]))
  }
  
  return(out)
}

# Variables to include in sample demographics table
  # gender = gender, age_cat = categorical age, educ_cat = categorical education,
  # inc_cat = income (excluding prefer not to say), pid = party ID, ideo = ideology,
  # med_pref = stated media preferences
variables <- c("gender", "age_cat", "race_cat", "educ_cat", "inc_cat",
               "pid", "ideo", "med_pref")

# Calculate proportion of sample falling into each unique category for each of these
# demographic variables
props <- unlist(lapply(variables, function(y){
  srvy_na <- df %>% filter(!is.na(get(y))) %>% as.data.frame()
  
  unlist(lapply(1:length(unique(srvy_na[, y])), function(x){
    prop_func(variable = y, x, df = srvy_na)
  }))
}))

# Specify labels for each category
labels <- c("Male", "Female", "18-24", "25-34", "35-44", "45-54", "55-64", "65+",
            "White", "Black", "Hispanic/Latino", "Other Race/Ethnicity", 
            "High School or Less", "Some College", "College Degree", "Post-Graduate Degree",
            "<$50K", "$50-100K", ">$100K", "Democrat", "Independent", "Republican",
            "Liberal", "Moderate", "Conservative", "Entertainment", "Fox", "MSNBC")

# Bind together demographic categories and their corresponding % in sample
samp_dem <- cbind.data.frame(Category = labels, 
                             Percent = paste0(sprintf("%.1f", round(props * 100, 1)), "%"))

# Append an overarching category label for each variable (e.g., gender, age)
samp_dem$Variable <- c("Gender", "", "Age", rep("", 5), "Race/Ethnicity", rep("", 3),
                       "Education", rep("", 3), "Income", rep("", 2),
                       "Party ID", rep("", 2), "Ideology", rep("", 2), "Stated Preference", rep("", 2))

# Table A1: output sample demographics as a table
bold <- function(x) {paste('{\\textbf{',x,'}}', sep ='')}
print(xtable(samp_dem %>% select(Variable, Category, Percent),
             align = c(rep("l", 3), "c"), digits = 1,
             caption = "Demographic composition of survey sample ($n = 3513$). Percents may not add up to 100 due to rounding.",
             label = "samp_perc"), include.rownames = F,
      sanitize.colnames.function=bold, 
      hline.after = c(-1,0, 2, 8, 12, 16, 19, 22, 25, 28),
      booktabs = T, scalebox = 0.6, 
      file = paste0(table_path, "/tab_a1.tex"))

# Balance tables ----

# > Free vs. forced choice ----

# Balance between free- and forced-choice groups (using t-tests)
group_vars = c("gender", "age_cat", "race_cat", "educ_cat", "inc_cat",
               "pid", "ideo", "med_pref")

group_means <- as.data.frame(do.call("rbind", lapply(group_vars, function(x){
  # For each variable, filter out missing data
  filt <- df %>% 
    mutate(group = get(x)) %>% 
    filter(!is.na(group))
  
  # Calculate the proportions within each group
  out <- filt %>%
    group_by(forcedchoice, group) %>%
    summarise(n = n()) %>%
    mutate(freq = n / sum(n)) %>% 
    select(forcedchoice, group, freq) %>% 
    pivot_wider(names_from = forcedchoice, values_from = freq) %>% 
    as.data.frame()
  
  colnames(out) <- c("Category", "Free-Choice", "Forced-Choice")
  
  # Identify the number of unique groups
  filt$group <- factor(filt$group)
  groups <- levels(filt$group)
  
  # Run t-tests comparing each variable across groups
  out$p <- sapply(groups, function(y){
    t.test((group == y) ~ forcedchoice, data = filt)$p.value
  })
  
  out
})))

group_means$Category <- labels

# Table A2: balance between free- and forced-choice groups
print(xtable(group_means %>% rename("\\textit{\\textbf{p}}" = p), digits = 2, 
             align = c(rep("r", 2), rep("c", times = 3)),
             caption = "Demographic balance of the free- versus forced-choice conditions. Reported $p$ values come from independent samples t-tests. For income, partisanship, and ideology, missing data are excluded.",
             label = "bal_forcefree"),
      hline.after = c(-1, 0, 2, 8, 12, 16, 19, 22, 25, 28),
      sanitize.colnames.function=bold, include.rownames = F,
      booktabs = T, scalebox = 0.65, 
      file = paste0(table_path, "/tab_a2.tex"))

# Robustness check: assessing balance using Matching/ebal packages
form.free <- as.formula(forcedchoice ~ age + (gender == 1) + race_white + (educ > 3) + 
                          (income > 7) + factor(pid, levels = c(0, -1, 1)) +
                          factor(ideo, levels = c(0, -1, 1)) + 
                          factor(med_pref, levels = c("Entertainment",
                                                      "Fox", "MSNBC")))

matchbal.free <- MatchBalance(form.free, data = df, digits = 2, print.level = 0) 

matchbal.free.table <- baltest.collect(matchbal.free, 
                                       var.names = c("Age", "Male", "White", 
                                                     "College Degree or More",
                                                     "$50K or More",
                                                     "Democrat", "Republican",
                                                     "Liberal", "Conservative",
                                                     "Prefer Fox", "Prefer MSNBC"), after = F)

(matchbal.free.table <- matchbal.free.table[ ,c(1:2,6)] %>% as.data.frame())
colnames(matchbal.free.table) <- c("Forced-Choice", "Free-Choice", "\\textit{\\textbf{p}}")
matchbal.free.table$Category <- rownames(matchbal.free.table)

print(xtable(matchbal.free.table[,c(4, 1:3)], digits = 2, 
             align = c(rep("l", 2), rep("c", 3)),
             caption = "Demographic balance of the free- versus forced-choice conditions, calculated using the \\texttt{Matching} and \\texttt{ebal} packages in \\texttt{R}. The reference categories are ``Independent'' for partisanship, ``Moderate'' for ideology, and ``Prefer Entertainment'' for stated media preferences. All remaining variables (other than age) are treated as binary. Respondents who have missing data for one or more of these variables are excluded from the reported results.",
             label = "bal_forcefree"),
      hline.after = c(-1, 0, 1, 2, 3, 4, 5, 7, 9, 11),
      sanitize.colnames.function = bold, include.rownames = F,
      booktabs = T, scalebox = 0.65)

# > Forced-choice groups ----

# Create dummy variables for each comparison
df <- df %>% 
  mutate(fox_ent = case_when(article_forced == "Fox" ~ 1,
                             article_forced == "Entertainment" ~ 0),
         msnbc_ent = case_when(article_forced == "MSNBC" ~ 1,
                               article_forced == "Entertainment" ~ 0),
         fox_msnbc = case_when(article_forced == "Fox" ~ 1,
                               article_forced == "MSNBC" ~ 0))

# Assess balance across media exposure in forced-choice group (using t-tests)
group_means_forced <- as.data.frame(do.call("rbind", lapply(group_vars, function(x){
  # For each variable, filter out missing data
  filt <- df %>% 
    mutate(group = get(x)) %>% 
    filter(!is.na(group) & !is.na(article_forced))
  
  out <- filt %>%
    group_by(article_forced, group) %>%
    summarise(n = n()) %>%
    mutate(freq = n / sum(n)) %>% 
    select(article_forced, group, freq) %>% 
    pivot_wider(names_from = article_forced, values_from = freq) %>% 
    as.data.frame()
  
  colnames(out) <- c("Category", "MSNBC", "Fox", "Entertainment")
  
  # Number of unique groups
  filt$group <- factor(filt$group)
  groups <- levels(filt$group)
  
  out$p_foxent <- sapply(groups, function(y){
    t.test((group == y) ~ article_forced, data = filt %>% filter(article_forced != "MSNBC"))$p.value
  })
  out$p_msnbcent <- sapply(groups, function(y){
    t.test((group == y) ~ article_forced, data = filt %>% filter(article_forced != "Fox"))$p.value
  })
  out$p_foxmsnbc <- sapply(groups, function(y){
    t.test((group == y) ~ article_forced, data = filt %>% filter(article_forced != "Entertainment"))$p.value
  })
  
  out
})))

# Add descriptive row labels
group_means_forced$Category <- labels
group_means_forced <- group_means_forced %>% 
  select(Category, Entertainment, Fox, MSNBC, 
         `$p$: Fox vs. Ent.` = p_foxent,
         `$p$: MSNBC vs. Ent.` = p_msnbcent,
         `$p$: Fox vs. MSNBC` = p_foxmsnbc)

# Table A3: balance within forced-choice group
print(xtable(group_means_forced, digits = 2, 
             align = c(rep("l", 2), rep("c", 6)),
             caption = "Demographic balance in media exposure within forced-choice group. Reported $p$ values come from independent samples t-tests. For income, partisanship, and ideology, missing data are excluded.",
             label = "bal_article_forced"),
      hline.after = c(-1, 0, 2, 8, 12, 16, 19, 22, 25, 28),
      sanitize.colnames.function=bold, include.rownames = F,
      booktabs = T, scalebox = 0.65, 
      file = paste0(table_path, "/tab_a3.tex"))

# Robustness check: assessing balance using Matching/ebal packages
force_balance <- function(comp = "fox_msnbc"){
  form <- as.formula(comp_case ~ age + (gender == 1) + race_white + (educ > 3) + 
                       (income > 7) + factor(pid, levels = c(0, -1, 1)) +
                       factor(ideo, levels = c(0, -1, 1)) + 
                       factor(med_pref, levels = c("Entertainment",
                                                   "Fox", "MSNBC")))
  
  # Check balance
  matchbal <- MatchBalance(form, data = df %>% mutate(comp_case = get(comp)), 
                           digits = 2, print.level = 0) 
  
  # Collect results into a table
  table <- baltest.collect(matchbal, 
                           var.names = c("Age", "Male", "White", 
                                         "College Degree or More",
                                         "$50K or More",
                                         "Democrat", "Republican",
                                         "Liberal", "Conservative",
                                         "Prefer Fox", "Prefer MSNBC"), after = F)
  
  # Retain relevant table output
  table <- table[ ,c(1:2,6)] %>% as.data.frame()
  
  # Format table
  cat1 <- case_when(str_detect(comp, "fox_") ~ "Fox",
                    str_detect(comp, "msnbc_") ~ "MSNBC")
  cat2 <- case_when(str_detect(comp, "_msnbc") ~ "MSNBC",
                    str_detect(comp, "_ent") ~ "Entertainment")
  colnames(table) <- c(cat1, cat2, "$p$")
  table$Category <- rownames(table)
  
  return(table)
}

matchbal.foxmsnbc.table <- force_balance(comp = "fox_msnbc")
matchbal.foxent.table <- force_balance(comp = "fox_ent")
matchbal.msnbcent.table <- force_balance(comp = "msnbc_ent")

# Merge everything
merged_table <- cbind.data.frame(matchbal.foxmsnbc.table, 
                                 matchbal.foxent.table[, c("Entertainment", "$p$")],
                                 matchbal.msnbcent.table[, c("$p$")])
colnames(merged_table) <- c("Fox", "MSNBC", "$p$: Fox vs. MSNBC", "Category",
                            "Entertainment", "$p$: Fox vs. Entertainment",
                            "$p$: MSNBC vs. Entertainment")

merged_table <- merged_table %>% 
  select(Category, Fox, MSNBC, Entertainment,
         `$p$: Fox vs. MSNBC`, `$p$: Fox vs. Entertainment`, `$p$: MSNBC vs. Entertainment`)

print(xtable(merged_table, digits = 2, 
             align = c(rep("l", 2), rep("c", 3), "|c", rep("c", 2)),
             caption = "Demographic balance of media exposure within the forced-choice group, calculated using the \\texttt{Matching} and \\texttt{ebal} packages in \\texttt{R}. The reference categories are ``Independent'' for partisanship, ``Moderate'' for ideology, and ``Prefer Entertainment'' for stated media preferences. All remaining variables (other than age) are treated as binary. Respondents who have missing data for one or more of these variables are excluded from the reported results.",
             label = "bal_article_forced"),
      hline.after = c(-1, 0, 1, 2, 3, 4, 5, 7, 9, 11),
      sanitize.colnames.function = bold, include.rownames = F,
      booktabs = T, scalebox = 0.65)
